This R Markdown document reads, combines, and summarizes multiple _summary.csv files. *** ## 0. Packages

This section loads the packages used for import, manipulation, and plotting.

library(tidyverse)

# Make plots larger and labels more legible throughout the report
knitr::opts_chunk$set(
  fig.width = 16,
  fig.height = 8,
  dpi = 200,
  out.width = "100%",
  fig.align = "center"
)

# Ensure rmarkdown can find Pandoc.
# On Windows, Quarto bundles pandoc in: <quarto>/bin/tools/pandoc.exe.
if (!nzchar(Sys.which("pandoc"))) {
  quarto_exe <- Sys.which("quarto")
  if (nzchar(quarto_exe)) {
    quarto_tools <- normalizePath(
      file.path(dirname(quarto_exe), "tools"),
      winslash = "/",
      mustWork = FALSE
    )
    if (dir.exists(quarto_tools)) {
      Sys.setenv(RSTUDIO_PANDOC = quarto_tools)
    }
  }
}

1. Read and combine ALL CSVs

Reads all _summary.csv files from folder and combines them into a single master table dat_raw.

folder <- "C:\\Users\\dunnmk\\University of Michigan Dropbox\\MED-WILSONTELAB\\wilsontelab box\\Common\\Projects\\Yeast Aim 3\\3C Sequencing Data\\3C_summary"
files  <- list.files(folder, pattern = "_summary\\.csv$", full.names = TRUE)
raw_list <- lapply(files, read_csv)
## Rows: 76 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 76 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 74 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 74 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 74 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 74 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dat_raw <- bind_rows(raw_list)
dat_raw <- dat_raw %>%
  mutate(batch = factor(batch, levels = sort(unique(batch))))
str(dat_raw)
## tibble [448 × 10] (S3: tbl_df/tbl/data.frame)
##  $ batch         : Factor w/ 3 levels "4","6","8": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DSB2_loci     : chr [1:448] "chr7" "chr7" "chr7" "chr7" ...
##  $ time_point    : num [1:448] 0 0 0 0 0 0 0 0 0 0 ...
##  $ replicate     : num [1:448] 1 1 1 1 1 1 1 1 1 1 ...
##  $ alignment_name: chr [1:448] "CIS_A_to_B_DSB1_Chr12_L01" "CIS_A_to_B_DSB1_Chr12_L05" "CIS_A_to_B_DSB1_Chr12_L07" "CIS_A_to_B_DSB1_Chr12_L12" ...
##  $ cis_trans     : chr [1:448] "CIS" "CIS" "CIS" "CIS" ...
##  $ DSB           : chr [1:448] "DSB1" "DSB1" "DSB1" "DSB1" ...
##  $ combo         : chr [1:448] "A_to_B" "A_to_B" "A_to_B" "A_to_B" ...
##  $ allele        : chr [1:448] "Chr12_L01" "Chr12_L05" "Chr12_L07" "Chr12_L12" ...
##  $ count         : num [1:448] 23365 15119 19481 22779 27868 ...
dat_raw |> head(10) |> knitr::kable(caption = "Preview: combined raw `_summary.csv` rows (first 10)")
Preview: combined raw _summary.csv rows (first 10)
batch DSB2_loci time_point replicate alignment_name cis_trans DSB combo allele count
4 chr7 0 1 CIS_A_to_B_DSB1_Chr12_L01 CIS DSB1 A_to_B Chr12_L01 23365
4 chr7 0 1 CIS_A_to_B_DSB1_Chr12_L05 CIS DSB1 A_to_B Chr12_L05 15119
4 chr7 0 1 CIS_A_to_B_DSB1_Chr12_L07 CIS DSB1 A_to_B Chr12_L07 19481
4 chr7 0 1 CIS_A_to_B_DSB1_Chr12_L12 CIS DSB1 A_to_B Chr12_L12 22779
4 chr7 0 1 CIS_A_to_B_DSB1_Chr15_L01 CIS DSB1 A_to_B Chr15_L01 27868
4 chr7 0 1 CIS_A_to_B_DSB1_Chr15_L04 CIS DSB1 A_to_B Chr15_L04 54
4 chr7 0 1 CIS_A_to_B_DSB1_Chr15_L15 CIS DSB1 A_to_B Chr15_L15 14456
4 chr7 0 1 CIS_A_to_B_DSB1_Chr15_L17_2 CIS DSB1 A_to_B Chr15_L17_2 28338
4 chr7 0 1 CIS_A_to_B_DSB1_Chr15_L18 CIS DSB1 A_to_B Chr15_L18 4689
4 chr7 0 1 CIS_A_to_B_DSB1_Chr3_L02 CIS DSB1 A_to_B Chr3_L02 26669

2. Aggregate counts and percentages

Purpose- Calculate percentages from aggregated counts

summarize_counts_pct <- function(dat){
  dat %>%
    summarise(
      Total_Counts   = sum(count),
      Cis_Counts     = sum(count[cis_trans == "CIS"]),
      Trans_Counts   = sum(count[cis_trans == "TRANS"]),
      A_to_B_Counts  = sum(count[combo == "A_to_B"]),
      C_to_D_Counts  = sum(count[combo == "C_to_D"]),
      A_to_D_Counts  = sum(count[combo == "A_to_D"]),
      C_to_B_Counts  = sum(count[combo == "C_to_B"]),
      Percent_Cis    = if_else(Total_Counts > 0, 100 * Cis_Counts / Total_Counts, NA_real_),
      Percent_Trans  = if_else(Total_Counts > 0, 100 * Trans_Counts / Total_Counts, NA_real_),
      Percent_A_to_B = if_else(Total_Counts > 0, 100 * A_to_B_Counts / Total_Counts, NA_real_),
      Percent_C_to_D = if_else(Total_Counts > 0, 100 * C_to_D_Counts / Total_Counts, NA_real_),
      Percent_A_to_D = if_else(Total_Counts > 0, 100 * A_to_D_Counts / Total_Counts, NA_real_),
      Percent_C_to_B = if_else(Total_Counts > 0, 100 * C_to_B_Counts / Total_Counts, NA_real_),
      .groups = "drop"
    )
}
dat_agg_counts <- dat_raw %>%
  group_by(batch, time_point, DSB) %>%
  summarize_counts_pct()
str(dat_agg_counts)
## tibble [18 × 16] (S3: tbl_df/tbl/data.frame)
##  $ batch         : Factor w/ 3 levels "4","6","8": 1 1 1 1 1 1 2 2 2 2 ...
##  $ time_point    : num [1:18] 0 0 0 120 120 120 0 0 0 120 ...
##  $ DSB           : chr [1:18] "DSB1" "DSB2" "TRANS" "DSB1" ...
##  $ Total_Counts  : num [1:18] 499775 458690 3754 538320 483621 ...
##  $ Cis_Counts    : num [1:18] 499775 458690 0 538320 483621 ...
##  $ Trans_Counts  : num [1:18] 0 0 3754 0 0 ...
##  $ A_to_B_Counts : num [1:18] 499775 0 0 538320 0 ...
##  $ C_to_D_Counts : num [1:18] 0 458690 0 0 483621 ...
##  $ A_to_D_Counts : num [1:18] 0 0 3061 0 0 ...
##  $ C_to_B_Counts : num [1:18] 0 0 693 0 0 ...
##  $ Percent_Cis   : num [1:18] 100 100 0 100 100 0 100 100 0 100 ...
##  $ Percent_Trans : num [1:18] 0 0 100 0 0 100 0 0 100 0 ...
##  $ Percent_A_to_B: num [1:18] 100 0 0 100 0 0 100 0 0 100 ...
##  $ Percent_C_to_D: num [1:18] 0 100 0 0 100 0 0 100 0 0 ...
##  $ Percent_A_to_D: num [1:18] 0 0 81.5 0 0 ...
##  $ Percent_C_to_B: num [1:18] 0 0 18.5 0 0 ...
dat_agg_counts |> head(10) |> knitr::kable(caption = "Aggregated counts + derived percentages by batch × time_point × DSB (first 10)")
Aggregated counts + derived percentages by batch × time_point × DSB (first 10)
batch time_point DSB Total_Counts Cis_Counts Trans_Counts A_to_B_Counts C_to_D_Counts A_to_D_Counts C_to_B_Counts Percent_Cis Percent_Trans Percent_A_to_B Percent_C_to_D Percent_A_to_D Percent_C_to_B
4 0 DSB1 499775 499775 0 499775 0 0 0 100 0 100 0 0.00000 0.000000
4 0 DSB2 458690 458690 0 0 458690 0 0 100 0 0 100 0.00000 0.000000
4 0 TRANS 3754 0 3754 0 0 3061 693 0 100 0 0 81.53969 18.460309
4 120 DSB1 538320 538320 0 538320 0 0 0 100 0 100 0 0.00000 0.000000
4 120 DSB2 483621 483621 0 0 483621 0 0 100 0 0 100 0.00000 0.000000
4 120 TRANS 31673 0 31673 0 0 18323 13350 0 100 0 0 57.85054 42.149465
6 0 DSB1 223186 223186 0 223186 0 0 0 100 0 100 0 0.00000 0.000000
6 0 DSB2 167373 167373 0 0 167373 0 0 100 0 0 100 0.00000 0.000000
6 0 TRANS 11169 0 11169 0 0 10889 280 0 100 0 0 97.49306 2.506939
6 120 DSB1 169434 169434 0 169434 0 0 0 100 0 100 0 0.00000 0.000000

The table above is now aggregated to batch × time_point × DSB. ***

2.5 Batch-level QC plots (counts + composition)

Purpose- Quick sanity-check plots before allele-level cis/trans normalization.

# Aggregate from the master raw table
agg_qc <- dat_raw %>%
  group_by(batch, time_point, DSB) %>%
  summarize_counts_pct()

# From here on in QC, compute CIS/TRANS using ONLY the 4 combos of interest:
#   CIS   := A_to_B + C_to_D
#   TRANS := A_to_D + C_to_B
# and define percents relative to the total of these 4 combos.
#
# IMPORTANT: we intentionally DO NOT group by `DSB` here.
# In this dataset, `DSB` can include levels like "TRANS", and grouping by it
# will split CIS and TRANS into different facets and produce misleading 100% bars.
agg_qc_combo4 <- dat_raw %>%
  group_by(batch, time_point) %>%
  summarise(
    Total_All = sum(count),
    A_to_B = sum(count[combo == "A_to_B"]),
    C_to_D = sum(count[combo == "C_to_D"]),
    A_to_D = sum(count[combo == "A_to_D"]),
    C_to_B = sum(count[combo == "C_to_B"]),
    .groups = "drop"
  ) %>%
  mutate(
    Cis_Counts = A_to_B + C_to_D,
    Trans_Counts = A_to_D + C_to_B,
    Total_4Combos = Cis_Counts + Trans_Counts,
    Excluded_Counts = pmax(Total_All - Total_4Combos, 0),
    Percent_Cis = if_else(Total_4Combos > 0, 100 * Cis_Counts / Total_4Combos, NA_real_),
    Percent_Trans = if_else(Total_4Combos > 0, 100 * Trans_Counts / Total_4Combos, NA_real_),
    Percent_A_to_B_in_Cis = if_else(Cis_Counts > 0, 100 * A_to_B / Cis_Counts, NA_real_),
    Percent_C_to_D_in_Cis = if_else(Cis_Counts > 0, 100 * C_to_D / Cis_Counts, NA_real_),
    Percent_A_to_D_in_Trans = if_else(Trans_Counts > 0, 100 * A_to_D / Trans_Counts, NA_real_),
    Percent_C_to_B_in_Trans = if_else(Trans_Counts > 0, 100 * C_to_B / Trans_Counts, NA_real_)
  )

# Quick diagnostic table: if Excluded_Counts is large, then the 4-combo definition is omitting real counts.
agg_qc_combo4 %>%
  arrange(time_point, batch) %>%
  transmute(
    batch, time_point,
    Total_All,
    Total_4Combos,
    Excluded_Counts,
    Percent_Cis,
    Percent_Trans
  ) %>%
  head(20) %>%
  knitr::kable(caption = "QC (4-combo definition): totals vs excluded counts, plus CIS%/TRANS% (first 20 rows)")
QC (4-combo definition): totals vs excluded counts, plus CIS%/TRANS% (first 20 rows)
batch time_point Total_All Total_4Combos Excluded_Counts Percent_Cis Percent_Trans
4 0 962219 962219 0 99.60986 0.3901399
6 0 401728 401728 0 97.21976 2.7802394
8 0 404339 404339 0 96.41019 3.5898095
4 120 1053614 1053614 0 96.99387 3.0061294
6 120 253131 253131 0 95.00496 4.9950421
8 120 195628 195628 0 95.88300 4.1169976
# Formatting helpers (avoid extra package dependencies)
comma_label <- function(x) {
  ifelse(is.na(x), NA_character_, formatC(x, format = "f", digits = 0, big.mark = ","))
}

pct_label <- function(x, digits = 1) {
  ifelse(is.na(x), NA_character_, paste0(round(x, digits), "%"))
}

# 1) Total counts by batch
p_total_counts <- ggplot(agg_qc, aes(x = batch, y = Total_Counts, fill = DSB)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.75) +
  geom_text(
    aes(label = comma_label(Total_Counts)),
    position = position_dodge(width = 0.8),
    vjust = -0.25,
    size = 2.7
  ) +
  facet_wrap(~ time_point, scales = "free_y") +
  theme_bw() +
  labs(
    title = "Total counts by batch (faceted by time_point)",
    x = "Batch",
    y = "Total counts",
    fill = "DSB"
  )

if (nrow(agg_qc) > 0) {
  print(p_total_counts)
}

# 2) CIS-only counts by category within each batch (4-combo definition)
cis_counts_long <- agg_qc_combo4 %>%
  select(batch, time_point, Cis_Counts, A_to_B, C_to_D) %>%
  pivot_longer(
    cols = -c(batch, time_point),
    names_to = "Metric",
    values_to = "Counts"
  ) %>%
  mutate(
    Metric = factor(
      Metric,
      levels = c("Cis_Counts", "A_to_B", "C_to_D"),
      labels = c("CIS (total)", "A to B (cis)", "C to D (cis)")
    )
  )

p_cis_counts <- ggplot(cis_counts_long, aes(x = batch, y = Counts, fill = Metric)) +
  geom_col(position = position_dodge(width = 0.85), width = 0.8) +
  geom_text(
    aes(label = comma_label(Counts)),
    position = position_dodge(width = 0.85),
    vjust = -0.25,
    size = 2.4
  ) +
  facet_wrap(~ time_point, scales = "free_y") +
  theme_bw() +
  labs(
    title = "CIS counts by category per batch",
    x = "Batch",
    y = "Counts",
    fill = ""
  )

if (nrow(cis_counts_long) > 0) {
  print(p_cis_counts)
}

# 3) TRANS-only counts by category within each batch (4-combo definition)
trans_counts_long <- agg_qc_combo4 %>%
  select(batch, time_point, Trans_Counts, A_to_D, C_to_B) %>%
  pivot_longer(
    cols = -c(batch, time_point),
    names_to = "Metric",
    values_to = "Counts"
  ) %>%
  mutate(
    Metric = factor(
      Metric,
      levels = c("Trans_Counts", "A_to_D", "C_to_B"),
      labels = c("TRANS (total)", "A to D (trans)", "C to B (trans)")
    )
  )

p_trans_counts <- ggplot(trans_counts_long, aes(x = batch, y = Counts, fill = Metric)) +
  geom_col(position = position_dodge(width = 0.85), width = 0.8) +
  geom_text(
    aes(label = comma_label(Counts)),
    position = position_dodge(width = 0.85),
    vjust = -0.25,
    size = 2.4
  ) +
  facet_wrap(~ time_point, scales = "free_y") +
  theme_bw() +
  labs(
    title = "TRANS counts by category per batch",
    x = "Batch",
    y = "Counts",
    fill = ""
  )

if (nrow(trans_counts_long) > 0) {
  print(p_trans_counts)
}

# 4) CIS vs TRANS percent for each batch (single plot; bars side-by-side)
# Percent_Cis/Percent_Trans are computed relative to TOTAL_4Combos.
pct_cis_trans_long <- agg_qc_combo4 %>%
  select(batch, time_point, Percent_Cis, Percent_Trans) %>%
  pivot_longer(
    cols = c(Percent_Cis, Percent_Trans),
    names_to = "Class",
    values_to = "Percent"
  ) %>%
  mutate(
    Class = recode(Class, Percent_Cis = "CIS", Percent_Trans = "TRANS"),
    Label = pct_label(Percent)
  )

p_pct_cis_trans <- ggplot(pct_cis_trans_long, aes(x = batch, y = Percent, fill = Class)) +
  geom_col(position = position_dodge(width = 0.85), width = 0.8) +
  geom_text(
    aes(label = Label),
    position = position_dodge(width = 0.85),
    vjust = -0.25,
    size = 3.2
  ) +
  facet_wrap(~ time_point) +
  scale_y_continuous(limits = c(0, 110)) +
  theme_bw() +
  labs(
    title = "QC: CIS vs TRANS percent per batch (within group)",
    x = "Batch",
    y = "% of total counts",
    fill = ""
  )

if (nrow(pct_cis_trans_long) > 0 && any(!is.na(pct_cis_trans_long$Percent))) {
  print(p_pct_cis_trans)
}

# 5b) Percent TRANS per batch at T0 vs T120 (single plot; bars side-by-side)
pct_trans_time_compare <- agg_qc_combo4 %>%
  filter(time_point %in% c(0, 120)) %>%
  transmute(
    batch,
    time_point = factor(time_point, levels = c(0, 120), labels = c("T0", "T120")),
    Percent_Trans,
    Label = pct_label(Percent_Trans)
  )

p_pct_trans_t0_t120 <- ggplot(pct_trans_time_compare, aes(x = batch, y = Percent_Trans, fill = time_point)) +
  geom_col(position = position_dodge(width = 0.85), width = 0.8) +
  geom_text(
    aes(label = Label),
    position = position_dodge(width = 0.85),
    vjust = -0.25,
    size = 3.2
  ) +
  scale_y_continuous(limits = c(0, 110)) +
  theme_bw() +
  labs(
    title = "QC: TRANS% per batch (T0 vs T120)",
    x = "Batch",
    y = "TRANS% of total",
    fill = "Time"
  )

if (nrow(pct_trans_time_compare) > 0 && any(!is.na(pct_trans_time_compare$Percent_Trans))) {
  print(p_pct_trans_t0_t120)
}

# 6) Within-class combo composition (stacked; should sum to 100% within class)
# Use ONLY the two combos per class (A_to_B/C_to_D within CIS; A_to_D/C_to_B within TRANS)
agg_combo_within_2only <- agg_qc_combo4

cis_combo_long <- agg_combo_within_2only %>%
  select(batch, time_point, Percent_A_to_B_in_Cis, Percent_C_to_D_in_Cis) %>%
  pivot_longer(
    cols = c(Percent_A_to_B_in_Cis, Percent_C_to_D_in_Cis),
    names_to = "Combo",
    values_to = "Percent"
  ) %>%
  mutate(
    Combo = recode(
      Combo,
      Percent_A_to_B_in_Cis = "A_to_B (within CIS)",
      Percent_C_to_D_in_Cis = "C_to_D (within CIS)"
    )
  )

p_cis_combo_comp <- ggplot(cis_combo_long, aes(x = batch, y = Percent, fill = Combo)) +
  geom_col(width = 0.75) +
  geom_text(
    aes(label = if_else(is.na(Percent) | Percent <= 0, NA_character_, paste0(round(Percent, 1), "%"))),
    position = position_stack(vjust = 0.5),
    size = 2.4
  ) +
  facet_wrap(~ time_point) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw() +
  labs(
    title = "QC: Within-CIS composition (A_to_B + C_to_D = 100%)",
    x = "Batch",
    y = "% of CIS",
    fill = ""
  )

if (nrow(cis_combo_long) > 0 && any(!is.na(cis_combo_long$Percent))) {
  print(p_cis_combo_comp)
}

trans_combo_long <- agg_combo_within_2only %>%
  select(batch, time_point, Percent_A_to_D_in_Trans, Percent_C_to_B_in_Trans) %>%
  pivot_longer(
    cols = c(Percent_A_to_D_in_Trans, Percent_C_to_B_in_Trans),
    names_to = "Combo",
    values_to = "Percent"
  ) %>%
  mutate(
    Combo = recode(
      Combo,
      Percent_A_to_D_in_Trans = "A_to_D (within TRANS)",
      Percent_C_to_B_in_Trans = "C_to_B (within TRANS)"
    )
  )

p_trans_combo_comp <- ggplot(trans_combo_long, aes(x = batch, y = Percent, fill = Combo)) +
  geom_col(width = 0.75) +
  geom_text(
    aes(label = if_else(is.na(Percent) | Percent <= 0, NA_character_, paste0(round(Percent, 1), "%"))),
    position = position_stack(vjust = 0.5),
    size = 2.4
  ) +
  facet_wrap(~ time_point) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw() +
  labs(
    title = "QC: Within-TRANS composition (A_to_D + C_to_B = 100%)",
    x = "Batch",
    y = "% of TRANS",
    fill = ""
  )

if (nrow(trans_combo_long) > 0 && any(!is.na(trans_combo_long$Percent))) {
  print(p_trans_combo_comp)
}

3. cis/trans normalization — compute totals and percentages per allele

my_summarize_cistrans <- function(dat){
  by_allele <- dat %>%
    group_by(batch, time_point, DSB, allele) %>%
    summarise(
      Cis_Location_Counts   = sum(count[cis_trans == "CIS"]),
      Trans_Location_Counts = sum(count[cis_trans == "TRANS"]),
      .groups = "drop"
    )

  totals <- by_allele %>%
    group_by(batch, time_point, DSB) %>%
    summarise(
      Total_Cis_Location_Counts   = sum(Cis_Location_Counts),
      Total_Trans_Location_Counts = sum(Trans_Location_Counts),
      .groups = "drop"
    )

  by_allele %>%
    left_join(totals, by = c("batch", "time_point", "DSB")) %>%
    mutate(
      Percent_Location_in_Cis   = if_else(Total_Cis_Location_Counts > 0,
                                         100 * Cis_Location_Counts / Total_Cis_Location_Counts,
                                         NA_real_),
      Percent_Location_in_Trans = if_else(Total_Trans_Location_Counts > 0,
                                         100 * Trans_Location_Counts / Total_Trans_Location_Counts,
                                         NA_real_)
    )
}

dat_norm <- my_summarize_cistrans(dat_raw)
dat_norm |> head(10) |> knitr::kable(caption = "Preview: allele-level CIS/TRANS counts and within-class percentages (first 10)")
Preview: allele-level CIS/TRANS counts and within-class percentages (first 10)
batch time_point DSB allele Cis_Location_Counts Trans_Location_Counts Total_Cis_Location_Counts Total_Trans_Location_Counts Percent_Location_in_Cis Percent_Location_in_Trans
4 0 DSB1 Chr12_L01 23365 0 499775 0 4.6751038 NA
4 0 DSB1 Chr12_L05 15119 0 499775 0 3.0251613 NA
4 0 DSB1 Chr12_L07 19481 0 499775 0 3.8979541 NA
4 0 DSB1 Chr12_L12 22779 0 499775 0 4.5578510 NA
4 0 DSB1 Chr15_L01 27868 0 499775 0 5.5761092 NA
4 0 DSB1 Chr15_L04 54 0 499775 0 0.0108049 NA
4 0 DSB1 Chr15_L15 14456 0 499775 0 2.8925016 NA
4 0 DSB1 Chr15_L17_2 28338 0 499775 0 5.6701516 NA
4 0 DSB1 Chr15_L18 4689 0 499775 0 0.9382222 NA
4 0 DSB1 Chr3_L02 26669 0 499775 0 5.3362013 NA

4. Allele frequency (CIS + TRANS)

cis_combos <- c("A_to_B", "C_to_D")
trans_combos <- c("A_to_D", "C_to_B")

# From section 4 onward, we define:
#   CIS   := A_to_B + C_to_D
#   TRANS := A_to_D + C_to_B
# and ignore any other combos so each group has exactly two CIS counts.

my_summarize_cistrans_by_combo <- function(dat, cis_combos, trans_combos){
  by_allele <- dat %>%
    group_by(batch, time_point, DSB, allele) %>%
    summarise(
      Cis_Location_Counts   = sum(count[combo %in% cis_combos]),
      Trans_Location_Counts = sum(count[combo %in% trans_combos]),
      .groups = "drop"
    )

  totals <- by_allele %>%
    group_by(batch, time_point, DSB) %>%
    summarise(
      Total_Cis_Location_Counts   = sum(Cis_Location_Counts),
      Total_Trans_Location_Counts = sum(Trans_Location_Counts),
      .groups = "drop"
    )

  by_allele %>%
    left_join(totals, by = c("batch", "time_point", "DSB")) %>%
    mutate(
      # Group-total counts (restricted to the 4 combos only)
      Total_Group_Counts = Total_Cis_Location_Counts + Total_Trans_Location_Counts,

      # Requested semantics (section 4+):
      #   CIS%  = cis_count / total_count
      #   TRANS% = trans_count / total_count
      # where total_count is the TOTAL within the group (batch × time_point × DSB)
      # and CIS/TRANS are defined by the two combos each.
      # NOTE: These are per-allele *contributions* to the group's total.
      Percent_Cis_of_GroupTotal = if_else(Total_Group_Counts > 0,
                                         100 * Cis_Location_Counts / Total_Group_Counts,
                                         NA_real_),
      Percent_Trans_of_GroupTotal = if_else(Total_Group_Counts > 0,
                                           100 * Trans_Location_Counts / Total_Group_Counts,
                                           NA_real_),

      Percent_Location_in_Cis   = if_else(Total_Cis_Location_Counts > 0,
                                         100 * Cis_Location_Counts / Total_Cis_Location_Counts,
                                         NA_real_),
      Percent_Location_in_Trans = if_else(Total_Trans_Location_Counts > 0,
                                         100 * Trans_Location_Counts / Total_Trans_Location_Counts,
                                         NA_real_)
    )
}

# Filter to only the four combos used downstream
dat_focus <- dat_raw %>%
  filter(combo %in% c(cis_combos, trans_combos))

dat_norm_combo <- my_summarize_cistrans_by_combo(dat_focus, cis_combos, trans_combos)

# ---- Group-level summaries using ONLY the four combos (section 4+ semantics) ----
# These are the plots that should behave like:
#   - A_to_B + C_to_D = 100% (within CIS)
#   - A_to_D + C_to_B = 100% (within TRANS)
#   - TRANS% of total is typically lower at T0 than T120 (if biology matches expectation)

dat_group4 <- dat_focus %>%
  # IMPORTANT: do NOT group by DSB here.
  # In this dataset, `DSB` may contain levels like "TRANS" which would split CIS and TRANS
  # across panels and yield misleading 100% bars.
  group_by(batch, time_point) %>%
  summarise(
    A_to_B = sum(count[combo == "A_to_B"]),
    C_to_D = sum(count[combo == "C_to_D"]),
    A_to_D = sum(count[combo == "A_to_D"]),
    C_to_B = sum(count[combo == "C_to_B"]),
    .groups = "drop"
  ) %>%
  mutate(
    Cis_Total = A_to_B + C_to_D,
    Trans_Total = A_to_D + C_to_B,
    Total = Cis_Total + Trans_Total,

    Percent_Cis_of_Total = if_else(Total > 0, 100 * Cis_Total / Total, NA_real_),
    Percent_Trans_of_Total = if_else(Total > 0, 100 * Trans_Total / Total, NA_real_),

    # These should sum to ~100 (within class) when Cis_Total/Trans_Total > 0
    Percent_A_to_B_in_Cis = if_else(Cis_Total > 0, 100 * A_to_B / Cis_Total, NA_real_),
    Percent_C_to_D_in_Cis = if_else(Cis_Total > 0, 100 * C_to_D / Cis_Total, NA_real_),
    Percent_A_to_D_in_Trans = if_else(Trans_Total > 0, 100 * A_to_D / Trans_Total, NA_real_),
    Percent_C_to_B_in_Trans = if_else(Trans_Total > 0, 100 * C_to_B / Trans_Total, NA_real_)
  )

# Share of total CIS/TRANS counts by batch (across ALL batches), separated by time_point.
# This treats the plots as "how much of the total CIS (or TRANS) at this time point comes from each batch".
dat_group4_share <- dat_group4 %>%
  group_by(time_point) %>%
  mutate(
    Total_Cis_AllBatches = sum(Cis_Total),
    Total_Trans_AllBatches = sum(Trans_Total),
    Percent_Cis_Share = if_else(Total_Cis_AllBatches > 0, 100 * Cis_Total / Total_Cis_AllBatches, NA_real_),
    Percent_Trans_Share = if_else(Total_Trans_AllBatches > 0, 100 * Trans_Total / Total_Trans_AllBatches, NA_real_)
  )

p_group4_trans_total <- ggplot(dat_group4_share, aes(x = batch, y = Percent_Trans_Share)) +
  geom_col(width = 0.75) +
  geom_text(aes(label = if_else(is.na(Percent_Trans_Share), NA_character_, paste0(round(Percent_Trans_Share, 1), "%"))),
            vjust = -0.25, size = 3.2) +
  facet_wrap(~ time_point) +
  scale_y_continuous(limits = c(0, 110)) +
  theme_bw() +
  labs(
    title = "Share of total TRANS counts by batch (A_to_D + C_to_B only)",
    x = "Batch",
    y = "% of total TRANS (within time point)"
  )

if (nrow(dat_group4_share) > 0 && any(!is.na(dat_group4_share$Percent_Trans_Share))) {
  print(p_group4_trans_total)
}

p_group4_cis_total <- ggplot(dat_group4_share, aes(x = batch, y = Percent_Cis_Share)) +
  geom_col(width = 0.75) +
  geom_text(aes(label = if_else(is.na(Percent_Cis_Share), NA_character_, paste0(round(Percent_Cis_Share, 1), "%"))),
            vjust = -0.25, size = 3.2) +
  facet_wrap(~ time_point) +
  scale_y_continuous(limits = c(0, 110)) +
  theme_bw() +
  labs(
    title = "Share of total CIS counts by batch (A_to_B + C_to_D only)",
    x = "Batch",
    y = "% of total CIS (within time point)"
  )

if (nrow(dat_group4_share) > 0 && any(!is.na(dat_group4_share$Percent_Cis_Share))) {
  print(p_group4_cis_total)
}

# Within-CIS composition: A_to_B vs C_to_D (should sum to 100%)
cis_comp_long4 <- dat_group4 %>%
  select(batch, time_point, Percent_A_to_B_in_Cis, Percent_C_to_D_in_Cis) %>%
  pivot_longer(
    cols = c(Percent_A_to_B_in_Cis, Percent_C_to_D_in_Cis),
    names_to = "Combo",
    values_to = "Percent"
  ) %>%
  mutate(
    Combo = recode(
      Combo,
      Percent_A_to_B_in_Cis = "A_to_B (within CIS)",
      Percent_C_to_D_in_Cis = "C_to_D (within CIS)"
    )
  )

p_cis_comp_2only <- ggplot(cis_comp_long4, aes(x = batch, y = Percent, fill = Combo)) +
  geom_col(width = 0.75) +
  geom_text(
    aes(label = if_else(is.na(Percent) | Percent <= 0, NA_character_, paste0(round(Percent, 1), "%"))),
    position = position_stack(vjust = 0.5),
    size = 2.4
  ) +
  facet_wrap(~ time_point) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw() +
  labs(
    title = "Within-CIS composition (A_to_B + C_to_D = 100%)",
    x = "Batch",
    y = "% of CIS",
    fill = ""
  )

if (nrow(cis_comp_long4) > 0 && any(!is.na(cis_comp_long4$Percent))) {
  print(p_cis_comp_2only)
}

# Within-TRANS composition: A_to_D vs C_to_B (should sum to 100%)
trans_comp_long4 <- dat_group4 %>%
  select(batch, time_point, Percent_A_to_D_in_Trans, Percent_C_to_B_in_Trans) %>%
  pivot_longer(
    cols = c(Percent_A_to_D_in_Trans, Percent_C_to_B_in_Trans),
    names_to = "Combo",
    values_to = "Percent"
  ) %>%
  mutate(
    Combo = recode(
      Combo,
      Percent_A_to_D_in_Trans = "A_to_D (within TRANS)",
      Percent_C_to_B_in_Trans = "C_to_B (within TRANS)"
    )
  )

p_trans_comp_2only <- ggplot(trans_comp_long4, aes(x = batch, y = Percent, fill = Combo)) +
  geom_col(width = 0.75) +
  geom_text(
    aes(label = if_else(is.na(Percent) | Percent <= 0, NA_character_, paste0(round(Percent, 1), "%"))),
    position = position_stack(vjust = 0.5),
    size = 2.4
  ) +
  facet_wrap(~ time_point) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw() +
  labs(
    title = "Within-TRANS composition (A_to_D + C_to_B = 100%)",
    x = "Batch",
    y = "% of TRANS",
    fill = ""
  )

if (nrow(trans_comp_long4) > 0 && any(!is.na(trans_comp_long4$Percent))) {
  print(p_trans_comp_2only)
}

my_summarize_allelefreq <- function(dat){
  by_allele <- dat %>%
    group_by(batch, time_point, DSB, allele) %>%
    summarise(
      Cis_Counts    = sum(count[combo %in% cis_combos]),
      Trans_Counts  = sum(count[combo %in% trans_combos]),
      Allele_Total  = Cis_Counts + Trans_Counts,
      .groups = "drop"
    )

  totals <- by_allele %>%
    group_by(batch, time_point, DSB) %>%
    summarise(
      Total_Cis   = sum(Cis_Counts),
      Total_Trans = sum(Trans_Counts),
      Total_All   = Total_Cis + Total_Trans,
      .groups = "drop"
    )

  by_allele %>%
    left_join(totals, by = c("batch", "time_point", "DSB")) %>%
    mutate(
      Allele_Frequency = if_else(Total_All > 0, Allele_Total / Total_All, NA_real_)
    )
}

dat_allele_freq <- my_summarize_allelefreq(dat_focus)
str(dat_allele_freq)
## tibble [304 × 11] (S3: tbl_df/tbl/data.frame)
##  $ batch           : Factor w/ 3 levels "4","6","8": 1 1 1 1 1 1 1 1 1 1 ...
##  $ time_point      : num [1:304] 0 0 0 0 0 0 0 0 0 0 ...
##  $ DSB             : chr [1:304] "DSB1" "DSB1" "DSB1" "DSB1" ...
##  $ allele          : chr [1:304] "Chr12_L01" "Chr12_L05" "Chr12_L07" "Chr12_L12" ...
##  $ Cis_Counts      : num [1:304] 23365 15119 19481 22779 27868 ...
##  $ Trans_Counts    : num [1:304] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Allele_Total    : num [1:304] 23365 15119 19481 22779 27868 ...
##  $ Total_Cis       : num [1:304] 5e+05 5e+05 5e+05 5e+05 5e+05 ...
##  $ Total_Trans     : num [1:304] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Total_All       : num [1:304] 5e+05 5e+05 5e+05 5e+05 5e+05 ...
##  $ Allele_Frequency: num [1:304] 0.0468 0.0303 0.039 0.0456 0.0558 ...

5. Fold change calculations (120 / 0)

library(tidyr)

# Fold change helper.
# We compute FC per allele/location as (T120 / T0) on COUNTS.
# If both numerator and denominator are 0, the fold change is undefined -> NA.
eps <- 1e-6
fc_ratio <- function(num, den, eps = 1e-6) {
  ifelse(is.na(num) | is.na(den), NA_real_,
         ifelse(num == 0 & den == 0, NA_real_, (num + eps) / (den + eps)))
}

dat_fc_cis <- dat_norm_combo %>%
  filter(time_point %in% c(0, 120)) %>%
  select(batch, time_point, DSB, allele, Cis_Location_Counts) %>%
  pivot_wider(names_from = time_point, values_from = Cis_Location_Counts, values_fill = 0) %>%
  mutate(
    FoldChange_Cis_120_vs_0 = fc_ratio(`120`, `0`, eps = eps),
    Log2FC_Cis_120_vs_0 = log2(FoldChange_Cis_120_vs_0)
  )

dat_fc_trans <- dat_norm_combo %>%
  filter(time_point %in% c(0, 120)) %>%
  select(batch, time_point, DSB, allele, Trans_Location_Counts) %>%
  pivot_wider(names_from = time_point, values_from = Trans_Location_Counts, values_fill = 0) %>%
  mutate(
    FoldChange_Trans_120_vs_0 = fc_ratio(`120`, `0`, eps = eps),
    Log2FC_Trans_120_vs_0 = log2(FoldChange_Trans_120_vs_0)
  )

6. Correlation: log2 Fold Change vs Allele Frequency

library(tidyr)

if (!exists("eps", inherits = TRUE)) eps <- 1e-6
if (!exists("fc_ratio", inherits = TRUE)) {
  fc_ratio <- function(num, den, eps = 1e-6) {
    ifelse(is.na(num) | is.na(den), NA_real_,
           ifelse(num == 0 & den == 0, NA_real_, (num + eps) / (den + eps)))
  }
}

dat_wide <- dat_norm_combo %>%
  filter(time_point %in% c(0, 120)) %>%
  select(batch, DSB, allele, time_point, Cis_Location_Counts, Trans_Location_Counts) %>%
  pivot_wider(names_from = time_point, values_from = c(Cis_Location_Counts, Trans_Location_Counts), values_fill = 0) %>%
  mutate(
    log2FC_CIS   = log2(fc_ratio(Cis_Location_Counts_120, Cis_Location_Counts_0, eps = eps)),
    log2FC_TRANS = log2(fc_ratio(Trans_Location_Counts_120, Trans_Location_Counts_0, eps = eps))
  )

dat_fc_af <- dat_wide %>%
  inner_join(
    dat_allele_freq %>% filter(time_point == 120) %>% select(batch, DSB, allele, Allele_Frequency),
    by = c("batch", "DSB", "allele")
  ) %>%
  filter(!is.na(Allele_Frequency) & Allele_Frequency > 0)

cor_summary <- dat_fc_af %>%
  group_by(batch, DSB) %>%
  summarise(
    cor_CIS_AF   = ifelse(n() >= 2, cor(log2FC_CIS, Allele_Frequency), NA),
    cor_TRANS_AF = ifelse(n() >= 2, cor(log2FC_TRANS, Allele_Frequency), NA),
    n_obs = n(),
    .groups = "drop"
  )

knitr::kable(cor_summary, caption = "Correlation summary: Pearson r between log2FC and allele frequency (by batch × DSB)")
Correlation summary: Pearson r between log2FC and allele frequency (by batch × DSB)
batch DSB cor_CIS_AF cor_TRANS_AF n_obs
4 DSB1 0.7330286 NA 24
4 DSB2 -1.0000000 NA 2
4 TRANS NA 0.5593380 26
6 DSB1 0.4087059 NA 24
6 DSB2 -1.0000000 NA 2
6 TRANS NA 0.4852722 24
8 DSB1 0.6913647 NA 24
8 DSB2 1.0000000 NA 2
8 TRANS NA 0.6118651 24

7a. CIS percentage bar plot

# CIS% contribution (within CIS only):
#   for each group (batch × time_point × DSB), we compute each allele's share of TOTAL CIS,
#   using ONLY the CIS combos (A_to_B + C_to_D).
df_cis_dist <- dat_norm_combo %>%
  filter(time_point %in% c(0, 120)) %>%
  select(batch, time_point, DSB, allele, Percent_Location_in_Cis)

plot_cis_contrib_by_batch <- function(df_cis_dist) {
  df_cis_dist <- df_cis_dist %>%
    mutate(
      time_point = factor(time_point, levels = c(0, 120), labels = c("T0", "T120")),
      DSB = factor(DSB, levels = c("DSB1", "DSB2", "TRANS"))
    )

  batches <- df_cis_dist %>% distinct(batch) %>% arrange(batch) %>% pull(batch)
  for (b in batches) {
    df_plot <- df_cis_dist %>% filter(batch == b)

    if (nrow(df_plot) == 0 || all(is.na(df_plot$Percent_Location_in_Cis))) next

    p <- ggplot(df_plot, aes(x = allele, y = Percent_Location_in_Cis, fill = DSB)) +
      geom_col(position = position_dodge(width = 0.9), width = 0.85) +
      geom_text(
        aes(label = if_else(is.na(Percent_Location_in_Cis), NA_character_, paste0(round(Percent_Location_in_Cis, 1), "%"))),
        position = position_dodge(width = 0.9),
        vjust = -0.25,
        size = 3.0,
        angle = 90
      ) +
      facet_wrap(~ time_point, scales = "free_x") +
      scale_y_continuous(limits = c(0, 110)) +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) +
      labs(
        title = paste0("CIS% contribution by allele (within CIS only) | Batch: ", b),
        x = "Allele",
        y = "CIS% of CIS total (within group)",
        fill = "DSB"
      )

    print(p)
  }
}

plot_cis_contrib_by_batch(df_cis_dist)
## Warning: Removed 52 rows containing missing values or values outside the scale
## range (`geom_col()`).
## Warning: Removed 52 rows containing missing values or values outside the scale
## range (`geom_text()`).

## Warning: Removed 48 rows containing missing values or values outside the scale
## range (`geom_col()`).
## Warning: Removed 48 rows containing missing values or values outside the scale
## range (`geom_text()`).

## Warning: Removed 48 rows containing missing values or values outside the scale range (`geom_col()`).
## Removed 48 rows containing missing values or values outside the scale range (`geom_text()`).

# 7b. TRANS percentage bar plot

# TRANS% contribution (within TRANS only):
#   for each group (batch × time_point × DSB), we compute each allele's share of TOTAL TRANS,
#   using ONLY the TRANS combos (A_to_D + C_to_B).

df_trans_dist <- dat_norm_combo %>%
  filter(time_point %in% c(0, 120)) %>%
  select(batch, time_point, DSB, allele, Percent_Location_in_Trans)

plot_trans_contrib_by_batch <- function(df_trans_dist) {
  df_trans_dist <- df_trans_dist %>%
    mutate(
      time_point = factor(time_point, levels = c(0, 120), labels = c("T0", "T120")),
      DSB = factor(DSB, levels = c("DSB1", "DSB2", "TRANS"))
    )

  batches <- df_trans_dist %>% distinct(batch) %>% arrange(batch) %>% pull(batch)
  for (b in batches) {
    df_plot <- df_trans_dist %>% filter(batch == b)

    if (nrow(df_plot) == 0 || all(is.na(df_plot$Percent_Location_in_Trans))) next

    p <- ggplot(df_plot, aes(x = allele, y = Percent_Location_in_Trans, fill = DSB)) +
      geom_col(position = position_dodge(width = 0.9), width = 0.85) +
      geom_text(
        aes(label = if_else(is.na(Percent_Location_in_Trans), NA_character_, paste0(round(Percent_Location_in_Trans, 1), "%"))),
        position = position_dodge(width = 0.9),
        vjust = -0.25,
        size = 3.0,
        angle = 90
      ) +
      facet_wrap(~ time_point, scales = "free_x") +
      scale_y_continuous(limits = c(0, 110)) +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) +
      labs(
        title = paste0("TRANS% contribution by allele (within TRANS only) | Batch: ", b),
        x = "Allele",
        y = "TRANS% of TRANS total (within group)",
        fill = "DSB"
      )

    print(p)
  }
}

plot_trans_contrib_by_batch(df_trans_dist)
## Warning: Removed 52 rows containing missing values or values outside the scale
## range (`geom_col()`).
## Warning: Removed 52 rows containing missing values or values outside the scale
## range (`geom_text()`).

## Warning: Removed 52 rows containing missing values or values outside the scale range (`geom_col()`).
## Removed 52 rows containing missing values or values outside the scale range (`geom_text()`).

## Warning: Removed 52 rows containing missing values or values outside the scale range (`geom_col()`).
## Removed 52 rows containing missing values or values outside the scale range (`geom_text()`).

# Optional: TRANS distribution broken down by combo (A_to_D vs C_to_B)
# Requested view: T0 and T120 on the SAME graph with different-colored bars.
plot_trans_percent_by_combo <- function(dat, combo_name) {
  combo_map <- c(
    "A to D" = "A_to_D",
    "A_to_D" = "A_to_D",
    "B to C" = "C_to_B",
    "B_to_C" = "C_to_B",
    "C to B" = "C_to_B",
    "C_to_B" = "C_to_B"
  )
  if (!is.null(combo_map[[combo_name]])) {
    combo_name <- combo_map[[combo_name]]
  }

  df_plot <- dat %>%
    filter(combo == combo_name, time_point %in% c(0, 120)) %>%
    group_by(batch, time_point, DSB, allele) %>%
    summarise(
      Trans_Counts = sum(count),
      .groups = "drop"
    ) %>%
    group_by(batch, time_point, DSB) %>%
    mutate(
      Total_Trans = sum(Trans_Counts),
      Percent_Trans = if_else(Total_Trans > 0, 100 * Trans_Counts / Total_Trans, NA_real_)
    )

  if (nrow(df_plot) == 0 || all(is.na(df_plot$Percent_Trans))) return(NULL)

  ggplot(
      df_plot,
      aes(
        x = allele,
        y = Percent_Trans,
        fill = factor(time_point, levels = c(0, 120), labels = c("T0", "T120"))
      )
    ) +
      geom_col(position = position_dodge(width = 0.9), width = 0.85) +
      geom_text(
        aes(label = if_else(is.na(Percent_Trans), NA_character_, paste0(round(Percent_Trans, 1), "%"))),
        position = position_dodge(width = 0.9),
        vjust = -0.25,
        size = 3.0,
        angle = 90
      ) +
      facet_grid(batch + DSB ~ ., scales = "free_x") +
      scale_y_continuous(limits = c(0, 110)) +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) +
      labs(
        title = paste("Percent of", combo_name, "counts by allele (within combo; T0 vs T120)"),
        x = "Allele",
        y = "% within combo (within group)",
        fill = "Time"
      )
}

p_trans_ad <- plot_trans_percent_by_combo(dat_focus, "A_to_D")
if (!is.null(p_trans_ad)) print(p_trans_ad)

p_trans_cb <- plot_trans_percent_by_combo(dat_focus, "C_to_B")
if (!is.null(p_trans_cb)) print(p_trans_cb)

*** # 8. Fold Change bar plot

plot_foldchange_cis_by_batch <- function(dat_fc_cis) {
  batches <- dat_fc_cis %>% distinct(batch) %>% arrange(batch) %>% pull(batch)
  for (b in batches) {
    df_plot <- dat_fc_cis %>% filter(batch == b)

    if (nrow(df_plot) == 0 || all(is.na(df_plot$FoldChange_Cis_120_vs_0))) next
    p <- ggplot(df_plot, aes(x = allele, y = FoldChange_Cis_120_vs_0, fill = DSB)) +
      geom_col(position = position_dodge(width = 0.9), width = 0.85) +
      geom_text(
        aes(label = if_else(is.na(FoldChange_Cis_120_vs_0), NA_character_, sprintf("%.3f", FoldChange_Cis_120_vs_0))),
        position = position_dodge(width = 0.9),
        vjust = -0.25,
        size = 3.0,
        angle = 90
      ) +
      scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
      theme_bw(base_size = 14) +
      theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) +
      labs(
        title = paste0("Fold change (120/0) in CIS counts by allele | Batch: ", b),
        x = "Allele",
        y = "Fold change of CIS counts (120/0)",
        fill = "DSB"
      )
    print(p)
  }
}

plot_foldchange_trans_by_batch <- function(dat_fc_trans) {
  batches <- dat_fc_trans %>% distinct(batch) %>% arrange(batch) %>% pull(batch)
  for (b in batches) {
    df_plot <- dat_fc_trans %>% filter(batch == b)

    if (nrow(df_plot) == 0 || all(is.na(df_plot$FoldChange_Trans_120_vs_0))) next
    p <- ggplot(df_plot, aes(x = allele, y = FoldChange_Trans_120_vs_0, fill = DSB)) +
      geom_col(position = position_dodge(width = 0.9), width = 0.85) +
      geom_text(
        aes(label = if_else(is.na(FoldChange_Trans_120_vs_0), NA_character_, sprintf("%.3f", FoldChange_Trans_120_vs_0))),
        position = position_dodge(width = 0.9),
        vjust = -0.25,
        size = 3.0,
        angle = 90
      ) +
      scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
      theme_bw(base_size = 14) +
      theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) +
      labs(
        title = paste0("Fold change (120/0) in TRANS counts by allele | Batch: ", b),
        x = "Allele",
        y = "Fold change of TRANS counts (120/0)",
        fill = "DSB"
      )
    print(p)
  }
}

plot_allele_frequency_by_batch <- function(dat_allele_freq) {
  batches <- dat_allele_freq %>% distinct(batch) %>% arrange(batch) %>% pull(batch)
  for (b in batches) {
    df_plot <- dat_allele_freq %>% filter(batch == b)

    if (nrow(df_plot) == 0 || all(is.na(df_plot$Allele_Frequency))) next
    p <- ggplot(df_plot, aes(x = allele, y = Allele_Frequency, fill = DSB)) +
      geom_col(position = position_dodge(width = 0.9), width = 0.85) +
      geom_text(
        aes(label = if_else(is.na(Allele_Frequency), NA_character_, sprintf("%.3f", Allele_Frequency))),
        position = position_dodge(width = 0.9),
        vjust = -0.25,
        size = 3.0,
        angle = 90
      ) +
      scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
      facet_wrap(~ time_point, scales = "free_x") +
      theme_bw(base_size = 14) +
      theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) +
      labs(
        title = paste0("Allele Frequency (CIS + TRANS) | Batch: ", b),
        x = "Allele",
        y = "Allele Frequency",
        fill = "DSB"
      )
    print(p)
  }
}

# Generate updated plots
plot_foldchange_cis_by_batch(dat_fc_cis)
## Warning: Removed 26 rows containing missing values or values outside the scale
## range (`geom_col()`).
## Warning: Removed 26 rows containing missing values or values outside the scale
## range (`geom_text()`).

## Warning: Removed 24 rows containing missing values or values outside the scale
## range (`geom_col()`).
## Warning: Removed 24 rows containing missing values or values outside the scale
## range (`geom_text()`).

## Warning: Removed 24 rows containing missing values or values outside the scale range (`geom_col()`).
## Removed 24 rows containing missing values or values outside the scale range (`geom_text()`).

plot_foldchange_trans_by_batch(dat_fc_trans)
## Warning: Removed 26 rows containing missing values or values outside the scale
## range (`geom_col()`).
## Warning: Removed 26 rows containing missing values or values outside the scale
## range (`geom_text()`).

## Warning: Removed 26 rows containing missing values or values outside the scale range (`geom_col()`).
## Removed 26 rows containing missing values or values outside the scale range (`geom_text()`).

## Warning: Removed 26 rows containing missing values or values outside the scale range (`geom_col()`).
## Removed 26 rows containing missing values or values outside the scale range (`geom_text()`).

plot_allele_frequency_by_batch(dat_allele_freq)

*** # 9. Correlation scatter plots (log2FC vs Allele Frequency)

use_repel <- requireNamespace("ggrepel", quietly = TRUE)

plot_correlation_cis <- function(dat_fc_af, batch_name, dsb_name) {
  df_plot <- dat_fc_af %>% filter(batch == batch_name, DSB == dsb_name)
  if (nrow(df_plot) >= 2) {
    p <- ggplot(df_plot, aes(x = Allele_Frequency, y = log2FC_CIS, label = allele)) +
      geom_point(size = 3, alpha = 0.8, color = "darkgreen") +
      geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
      geom_smooth(method = "lm", se = FALSE, color = "red") +
      theme_bw() +
      labs(
        title = paste0("Log2FC CIS vs Allele Freq | Batch: ", batch_name, " | DSB: ", dsb_name),
        x = "Allele Frequency", y = "log2FC CIS"
      )

    if (use_repel) {
      p <- p + ggrepel::geom_text_repel(size = 3, max.overlaps = 20)
    } else {
      p <- p + geom_text(size = 3, check_overlap = TRUE, vjust = -0.25)
    }

    p
  }
}

plot_correlation_trans <- function(dat_fc_af, batch_name, dsb_name) {
  df_plot <- dat_fc_af %>% filter(batch == batch_name, DSB == dsb_name)
  if (nrow(df_plot) >= 2) {
    p <- ggplot(df_plot, aes(x = Allele_Frequency, y = log2FC_TRANS, label = allele)) +
      geom_point(size = 3, alpha = 0.8, color = "steelblue") +
      geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
      geom_smooth(method = "lm", se = FALSE, color = "red") +
      theme_bw() +
      labs(
        title = paste0("Log2FC TRANS vs Allele Freq | Batch: ", batch_name, " | DSB: ", dsb_name),
        x = "Allele Frequency", y = "log2FC TRANS"
      )

    if (use_repel) {
      p <- p + ggrepel::geom_text_repel(size = 3, max.overlaps = 20)
    } else {
      p <- p + geom_text(size = 3, check_overlap = TRUE, vjust = -0.25)
    }

    p
  }
}

# Generate plots per batch × DSB (no pooling)
plot_keys <- dat_fc_af %>% distinct(batch, DSB) %>% arrange(batch, DSB)
for (i in seq_len(nrow(plot_keys))) {
  b <- plot_keys$batch[[i]]
  d <- plot_keys$DSB[[i]]
  p_cis <- plot_correlation_cis(dat_fc_af, b, d)
  p_trans <- plot_correlation_trans(dat_fc_af, b, d)
  if (!is.null(p_cis)) print(p_cis)
  if (!is.null(p_trans)) print(p_trans)
}
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 24 rows containing missing values or values outside the scale
## range (`geom_point()`).
## Warning: Removed 24 rows containing missing values or values outside the scale
## range (`geom_text_repel()`).

## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale
## range (`geom_point()`).
## Warning: Removed 2 rows containing missing values or values outside the scale
## range (`geom_text_repel()`).

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 26 rows containing missing values or values outside the scale
## range (`geom_point()`).
## Warning: Removed 26 rows containing missing values or values outside the scale
## range (`geom_text_repel()`).

## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?

## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 24 rows containing missing values or values outside the scale
## range (`geom_point()`).
## Warning: Removed 24 rows containing missing values or values outside the scale
## range (`geom_text_repel()`).

## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale
## range (`geom_point()`).
## Warning: Removed 2 rows containing missing values or values outside the scale
## range (`geom_text_repel()`).

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 24 rows containing missing values or values outside the scale
## range (`geom_point()`).
## Warning: Removed 24 rows containing missing values or values outside the scale
## range (`geom_text_repel()`).

## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?

## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 24 rows containing missing values or values outside the scale
## range (`geom_point()`).
## Warning: Removed 24 rows containing missing values or values outside the scale
## range (`geom_text_repel()`).

## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale
## range (`geom_point()`).
## Warning: Removed 2 rows containing missing values or values outside the scale
## range (`geom_text_repel()`).

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 24 rows containing missing values or values outside the scale
## range (`geom_point()`).
## Warning: Removed 24 rows containing missing values or values outside the scale
## range (`geom_text_repel()`).

## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?